import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import plotly.express as px
# Load the per-sample metadata and key it by sample ID so it can be aligned
# with the expression matrix on the index.
meta = pd.read_csv('lgg_lb_meta.csv')
meta = meta.set_index(['SDG_ID'])

# Load the plasma matrix. Its first column carries the feature names and the
# remaining columns are one per sample (this is what the transpose below
# relies on).
df = pd.read_csv('lb_plasma_matrix.csv')

# Promote the feature-name column to the index *before* transposing.
# Transposing with that string column still in the data forces every value to
# ``object`` dtype; doing it this way keeps the counts numeric and yields a
# samples x features frame whose columns are the feature names.
tdf = df.set_index(df.columns[0]).T

# Keep only samples present in both the metadata and the matrix.
main_df = pd.concat([meta, tdf], axis=1, join="inner")

# Frame for the Short_Histology task: drop every other metadata column so
# that only the label and the feature columns remain.
short_histology_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Tumor_Subtype', 'Relapse', 'Survival_Status'],axis=1)
short_histology_df
| Short_Histology | CTRL_ANT1 | CTRL_ANT2 | CTRL_ANT3 | CTRL_ANT4 | CTRL_ANT5 | CTRL_miR_POS | HK_ACTB | HK_B2M | HK_GAPDH | ... | miR-944 | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15635-37 | LGG | 16 | 16 | 22 | 25 | 5 | 20225 | 27 | 109 | 6245 | ... | 23 | 44 | 31 | 19 | 21 | 125 | 38 | 891 | 75 | 290 |
| 15635-43 | HGG | 35 | 60 | 49 | 60 | 47 | 51017 | 30 | 103 | 12707 | ... | 44 | 83 | 76 | 37 | 37 | 95 | 96 | 1876 | 78 | 662 |
| 15635-45 | HGG | 122 | 139 | 145 | 95 | 127 | 16810 | 145 | 1535 | 3414 | ... | 79 | 179 | 212 | 179 | 113 | 430 | 111 | 1141 | 102 | 1034 |
| 15635-46 | LGG | 41 | 60 | 99 | 68 | 49 | 58350 | 95 | 239 | 29270 | ... | 11 | 99 | 172 | 67 | 35 | 62 | 118 | 1972 | 143 | 596 |
| 15635-53 | HGG | 6 | 20 | 3 | 27 | 5 | 20365 | 39 | 302 | 6065 | ... | 13 | 51 | 27 | 7 | 20 | 316 | 26 | 2028 | 110 | 540 |
| 15635-60 | LGG | 12 | 68 | 43 | 102 | 35 | 87035 | 58 | 199 | 19675 | ... | 53 | 98 | 127 | 28 | 104 | 20 | 123 | 1020 | 174 | 370 |
| 15635-68 | LGG | 99 | 86 | 36 | 99 | 60 | 39440 | 79 | 748 | 15361 | ... | 61 | 190 | 162 | 98 | 75 | 256 | 67 | 1842 | 168 | 1263 |
| 15635-80 | LGG | 56 | 87 | 97 | 461 | 47 | 7490 | 77 | 150 | 17922 | ... | 68 | 82 | 142 | 65 | 55 | 263 | 37 | 377 | 217 | 960 |
| 15635-87 | HGG | 76 | 77 | 97 | 88 | 98 | 44954 | 45 | 262 | 4624 | ... | 117 | 201 | 202 | 47 | 65 | 297 | 55 | 3270 | 248 | 896 |
| 15635-90 | LGG | 69 | 52 | 76 | 110 | 68 | 20213 | 67 | 175 | 5244 | ... | 40 | 84 | 56 | 81 | 73 | 168 | 46 | 3492 | 155 | 282 |
| 15635-100 | LGG | 0 | 17 | 26 | 20 | 18 | 38373 | 195 | 3374 | 17375 | ... | 0 | 57 | 62 | 24 | 22 | 120 | 72 | 3586 | 73 | 919 |
| 15635-101 | LGG | 39 | 30 | 21 | 33 | 23 | 19047 | 17 | 86 | 8125 | ... | 39 | 62 | 92 | 39 | 27 | 126 | 33 | 1508 | 64 | 254 |
| 15635-127 | LGG | 16 | 6 | 19 | 73 | 16 | 24390 | 33 | 304 | 8055 | ... | 7 | 28 | 50 | 38 | 10 | 113 | 27 | 865 | 44 | 335 |
| 15635-134 | LGG | 28 | 50 | 19 | 64 | 4 | 47361 | 302 | 1665 | 12094 | ... | 60 | 147 | 104 | 40 | 31 | 180 | 75 | 2621 | 154 | 1349 |
| 15635-154 | HGG | 9 | 30 | 3 | 26 | 11 | 22451 | 5 | 55 | 2794 | ... | 8 | 92 | 65 | 15 | 52 | 259 | 29 | 985 | 69 | 467 |
| 15635-156 | HGG | 45 | 14 | 36 | 15 | 23 | 37931 | 75 | 837 | 2349 | ... | 46 | 220 | 67 | 0 | 0 | 307 | 40 | 8387 | 128 | 1737 |
| 15635-239 | HGG | 24 | 24 | 77 | 92 | 24 | 23781 | 105 | 223 | 306 | ... | 44 | 110 | 94 | 49 | 27 | 340 | 53 | 2902 | 189 | 823 |
17 rows × 2103 columns
# Separate the label column ('Short_Histology') from the feature columns.
sh_label_mask = short_histology_df.columns == 'Short_Histology'
sh_X = short_histology_df.loc[:, ~sh_label_mask].values
sh_y = short_histology_df.loc[:, sh_label_mask].values.ravel()

# Hold out 25% of the samples for testing; the seed is fixed for repeatability.
# NOTE(review): no `stratify` is passed, so the class mix of the test fold is
# left to the shuffle.
sh_split = train_test_split(sh_X, sh_y, test_size=0.25, random_state=42)
sh_X_train, sh_X_test, sh_y_train, sh_y_test = sh_split

# Sanity check: shapes of X_train, X_test, y_train, y_test (in that order).
print(*(part.shape for part in sh_split))
(12, 2102) (5, 2102) (12,) (5,)
# Visualise how many samples fall into each Short_Histology class.
fig = px.histogram(data_frame=short_histology_df, x='Short_Histology')
fig.show()

# Shallow random forest (depth capped at 2) with a fixed seed.
sh_rf = RandomForestClassifier(random_state=0, max_depth=2)

# Fit on the training fold.
sh_rf.fit(X=sh_X_train, y=sh_y_train)

# Predict the held-out fold.
sh_rf_y_pred = sh_rf.predict(X=sh_X_test)

# Fraction of held-out samples classified correctly.
sh_accuracy = accuracy_score(y_true=sh_y_test, y_pred=sh_rf_y_pred)
print(f'Accuracy: {sh_accuracy}')
Accuracy: 0.6
# Confusion matrix for the Short_Histology classifier.
# scikit-learn's convention: entry [i, j] counts samples whose true class is
# labels[i] and whose predicted class is labels[j]; rows/columns follow the
# order given in `labels` (here the classifier's own class order).
sh_cm = confusion_matrix(sh_y_test, sh_rf_y_pred, labels=sh_rf.classes_)

# Tick labels must follow the same order as the matrix rows/columns, so take
# them from the classifier rather than from the order in which the classes
# happen to appear in the data frame.
sh_class_labels = sh_rf.classes_.tolist()

# px.imshow draws matrix rows along y and columns along x, hence
# y = true class and x = predicted class. Cell values are sample counts.
disp = px.imshow(
    sh_cm,
    text_auto=True,
    labels=dict(x="Predicted Subtype", y="True Subtype", color="Count"),
    x=sh_class_labels,
    y=sh_class_labels,
)
disp.show()
# What are the most important features?
# rfc2_feature_list = _df.columns
# rfc2_feature_list = rfc2_feature_list.drop('class')
# rfc2_imp_features = pd.Series(rfc2.feature_importances_, index=rfc2_feature_list)
# rfc2_imp_genes = rfc2_imp_features.sort_values(ascending=False).to_frame().reset_index()
# rfc2_imp_genes.columns = ["features", "importance"]
# rfc2_imp_genes_fil = rfc2_imp_genes[~(rfc2_imp_genes == 0.000000).any(axis=1)]
# rfc2_imp_genes_fil
# Display interactive Barplot of important miRNAs
# fig = px.bar(rfc2_imp_genes_fil, x=rfc2_imp_genes_fil.features, y=rfc2_imp_genes_fil.importance)
# fig.show()
# Frame for the Relapse task: drop every other metadata column so that only
# the 'Relapse' label and the feature columns remain.
relapse_df = main_df.drop(
    columns=['Specimen_Type', 'Diagnosis', 'Short_Histology', 'Tumor_Subtype', 'Survival_Status']
)

# Separate the label column ('Relapse') from the feature columns.
r_label_mask = relapse_df.columns == 'Relapse'
r_X = relapse_df.loc[:, ~r_label_mask].values
r_y = relapse_df.loc[:, r_label_mask].values.ravel()

# Hold out 25% of the samples for testing; the seed is fixed for repeatability.
# NOTE(review): no `stratify` is passed, so the class mix of the test fold is
# left to the shuffle.
r_split = train_test_split(r_X, r_y, test_size=0.25, random_state=42)
r_X_train, r_X_test, r_y_train, r_y_test = r_split

# Sanity check: shapes of X_train, X_test, y_train, y_test (in that order).
print(*(part.shape for part in r_split))
(12, 2102) (5, 2102) (12,) (5,)
# Visualise how many samples fall into each Relapse class.
fig = px.histogram(data_frame=relapse_df, x='Relapse')
fig.show()

# Shallow random forest (depth capped at 2) with a fixed seed.
r_rf = RandomForestClassifier(random_state=0, max_depth=2)

# Fit on the training fold.
r_rf.fit(X=r_X_train, y=r_y_train)

# Predict the held-out fold.
r_rf_y_pred = r_rf.predict(X=r_X_test)

# Fraction of held-out samples classified correctly.
r_accuracy = accuracy_score(y_true=r_y_test, y_pred=r_rf_y_pred)
print(f'Accuracy: {r_accuracy}')
Accuracy: 0.8
# Confusion matrix for the Relapse classifier.
# scikit-learn's convention: entry [i, j] counts samples whose true class is
# labels[i] and whose predicted class is labels[j]; rows/columns follow the
# order given in `labels` (here the classifier's own class order).
r_cm = confusion_matrix(r_y_test, r_rf_y_pred, labels=r_rf.classes_)

# Tick labels must follow the same order as the matrix rows/columns, so take
# them from the classifier rather than from the order in which the classes
# happen to appear in the data frame.
r_class_labels = r_rf.classes_.tolist()

# px.imshow draws matrix rows along y and columns along x, hence
# y = true class and x = predicted class. Cell values are sample counts.
disp = px.imshow(
    r_cm,
    text_auto=True,
    labels=dict(x="Predicted Relapse", y="True Relapse", color="Count"),
    x=r_class_labels,
    y=r_class_labels,
)
disp.show()